Fashion category prediction¶
This notebook predicts fashion categories based on bounding box coordinates using machine learning models with hyperparameter tuning and cross-validation.
In [2]:
# Core data handling
import json
import numpy as np
import pandas as pd
import lightgbm as lgb  # NOTE(review): imported but never used below — confirm before removing
import time
from sklearn import svm  # only referenced by the commented-out SVM config
from datetime import datetime
# Plotting (all charts below use plotly; matplotlib appears unused — TODO confirm)
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
# Modelling: splitting/tuning, preprocessing, estimators, metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
from sklearn.neural_network import MLPClassifier  # only referenced by the commented-out MLP config
import plotly.io as pio
# Render plotly figures inline in the notebook
pio.renderers.default = 'notebook'
import warnings
# Suppress library warnings (e.g. sklearn solver notices) to keep cell output readable
warnings.filterwarnings('ignore')
Data collection and exploratory analysis¶
In [3]:
# Load the product records (one JSON object per line) and the
# product-id -> category lookup table.
with open('fashion.json', 'r') as f:
    fashion_data = [json.loads(row) for row in f]

with open('fashion-cat.json', 'r') as f:
    fashion_cat = json.load(f)
In [4]:
# Build a DataFrame from the raw records and attach each product's
# category via the lookup table.
df = pd.DataFrame(fashion_data)
df['category'] = df['product'].map(fashion_cat)

print(f"Total products: {len(fashion_data)}")
print(f"Total unique categories: {len(fashion_cat)}")
df.head()
Total products: 72198 Total unique categories: 38111
Out[4]:
| product | scene | bbox | category | |
|---|---|---|---|---|
| 0 | 0027e30879ce3d87f82f699f148bff7e | cdab9160072dd1800038227960ff6467 | [0.434097, 0.859363, 0.560254, 1.0] | Apparel & Accessories|Shoes |
| 1 | 0027e30879ce3d87f82f699f148bff7e | 14f59334af4539132981b1324a731067 | [0.175269, 0.527773, 0.621485, 0.924899] | Apparel & Accessories|Shoes |
| 2 | 0027e30879ce3d87f82f699f148bff7e | e7d32df9f45b691afc580808750f73ca | [0.588666, 0.638503, 0.750647, 0.761368] | Apparel & Accessories|Shoes |
| 3 | 0027e30879ce3d87f82f699f148bff7e | c0be585ed21b1a6c6dc9559ebe007ede | [0.276699, 0.757741, 0.400485, 0.876138] | Apparel & Accessories|Shoes |
| 4 | 002a6586b8381b5efd39410657630b44 | 67ed2a06be8a26dc63d7a04d4e1a135f | [0.154545, 0.144809, 0.809091, 0.784153] | Apparel & Accessories|Handbags, Wallets & Cases |
In [5]:
# Quantify missingness before modelling: per-column counts, an overall
# percentage, and a closer look at the two fields the pipeline relies on.
null_counts = df.isnull().sum()
total_nulls = null_counts.sum()

print("Missing values in dataframe:")
print(null_counts)
print(f"\nTotal missing values: {total_nulls}")
print(f"Percentage of missing values: {(total_nulls / len(df) * 100):.2f}%")

if 'category' in df.columns:
    n_missing_cat = df['category'].isnull().sum()
    print(f"\nMissing categories: {n_missing_cat} ({n_missing_cat/len(df)*100:.2f}%)")
if 'bbox' in df.columns:
    n_missing_bbox = df['bbox'].isnull().sum()
    print(f"Missing bbox: {n_missing_bbox} ({n_missing_bbox/len(df)*100:.2f}%)")
Missing values in dataframe: product 0 scene 0 bbox 0 category 0 dtype: int64 Total missing values: 0 Percentage of missing values: 0.00% Missing categories: 0 (0.00%) Missing bbox: 0 (0.00%)
In [4]:
# Inspect the label distribution and chart the five most common categories.
category_counts = df['category'].value_counts()
top5 = category_counts.head(5)

print(f"Number of unique categories: {len(category_counts)}")
print(f"\nTop 5 categories:")
print(top5)

fig = px.bar(
    x=top5.index,
    y=top5.values,
    labels={'x': 'category', 'y': 'count'},
    title='Top 5 categories by frequency'
)
fig.update_xaxes(tickangle=20)
fig.show()
Number of unique categories: 10 Top 5 categories: category Apparel & Accessories|Shoes 22706 Apparel & Accessories|Clothing|Pants 14289 Apparel & Accessories|Clothing|Shirts & Tops 11957 Apparel & Accessories|Handbags, Wallets & Cases 6322 Apparel & Accessories|Clothing|Outerwear|Coats & Jackets 4788 Name: count, dtype: int64
Feature engineering¶
In [5]:
def extract_features(bbox):
    """Derive geometric features from a bounding box.

    Parameters
    ----------
    bbox : sequence of four floats
        Corner coordinates (x1, y1, x2, y2); the sample data suggests they
        are normalized to [0, 1] with x2 >= x1 and y2 >= y1 — TODO confirm.

    Returns
    -------
    dict
        The raw corners plus derived size/position/shape features:
        width, height, area, center_x, center_y, aspect_ratio,
        diagonal, perimeter.
    """
    x1, y1, x2, y2 = bbox
    width = x2 - x1
    height = y2 - y1
    return {
        'x1': x1,
        'y1': y1,
        'x2': x2,
        'y2': y2,
        'width': width,
        'height': height,
        'area': width * height,
        'center_x': (x1 + x2) / 2,
        'center_y': (y1 + y2) / 2,
        # Guard against degenerate boxes: a zero-height bbox previously
        # raised ZeroDivisionError; report an infinite aspect ratio instead.
        'aspect_ratio': width / height if height != 0 else float('inf'),
        'diagonal': np.sqrt(width**2 + height**2),
        'perimeter': 2 * (width + height),
    }
# Expand every bbox into its engineered features and join them back onto
# the product/category columns (row order is preserved by concat on axis=1).
feature_records = df['bbox'].apply(extract_features)
feature_df = pd.DataFrame(feature_records.tolist())
df_features = pd.concat([df[['product', 'category']], feature_df], axis=1)

print(f"Feature columns: {list(feature_df.columns)}")
df_features.head()
Feature columns: ['x1', 'y1', 'x2', 'y2', 'width', 'height', 'area', 'center_x', 'center_y', 'aspect_ratio', 'diagonal', 'perimeter']
Out[5]:
| product | category | x1 | y1 | x2 | y2 | width | height | area | center_x | center_y | aspect_ratio | diagonal | perimeter | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0027e30879ce3d87f82f699f148bff7e | Apparel & Accessories|Shoes | 0.434097 | 0.859363 | 0.560254 | 1.000000 | 0.126157 | 0.140637 | 0.017742 | 0.497175 | 0.929682 | 0.897040 | 0.188929 | 0.533588 |
| 1 | 0027e30879ce3d87f82f699f148bff7e | Apparel & Accessories|Shoes | 0.175269 | 0.527773 | 0.621485 | 0.924899 | 0.446216 | 0.397126 | 0.177204 | 0.398377 | 0.726336 | 1.123613 | 0.597342 | 1.686684 |
| 2 | 0027e30879ce3d87f82f699f148bff7e | Apparel & Accessories|Shoes | 0.588666 | 0.638503 | 0.750647 | 0.761368 | 0.161981 | 0.122865 | 0.019902 | 0.669656 | 0.699936 | 1.318366 | 0.203307 | 0.569692 |
| 3 | 0027e30879ce3d87f82f699f148bff7e | Apparel & Accessories|Shoes | 0.276699 | 0.757741 | 0.400485 | 0.876138 | 0.123786 | 0.118397 | 0.014656 | 0.338592 | 0.816939 | 1.045516 | 0.171292 | 0.484366 |
| 4 | 002a6586b8381b5efd39410657630b44 | Apparel & Accessories|Handbags, Wallets & Cases | 0.154545 | 0.144809 | 0.809091 | 0.784153 | 0.654546 | 0.639344 | 0.418480 | 0.481818 | 0.464481 | 1.023777 | 0.914982 | 2.587780 |
In [6]:
# Nine-panel histogram grid to eyeball each feature's distribution.
features_to_plot = ['width', 'height', 'area', 'center_x', 'center_y',
                    'aspect_ratio', 'diagonal', 'perimeter', 'x1']
fig = make_subplots(
    rows=3, cols=3,
    subplot_titles=('width', 'height', 'area', 'center X', 'center Y', 'aspect ratio',
                    'diagonal', 'perimeter', 'x1'),
    specs=[[{"secondary_y": False} for _ in range(3)] for _ in range(3)]
)
for position, feat_name in enumerate(features_to_plot):
    fig.add_trace(
        go.Histogram(x=feature_df[feat_name], nbinsx=50, name=feat_name, showlegend=False),
        row=position // 3 + 1, col=position % 3 + 1
    )
fig.update_layout(height=900, title_text="feature distributions")
fig.show()
Pre-processing¶
In [7]:
# Assemble the model matrix and integer-encode the category labels.
feature_columns = ['x1', 'y1', 'x2', 'y2', 'width', 'height', 'area',
                   'center_x', 'center_y', 'aspect_ratio', 'diagonal', 'perimeter']
X = df_features[feature_columns].values
y = df_features['category'].values

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Number of features: {X.shape[1]}")
print(f"Number of samples: {X.shape[0]}")
print(f"Number of classes: {len(label_encoder.classes_)}")
print(f"\nClass distribution:")

# Per-class sample counts (capped at the first 10 classes for brevity).
class_ids, class_sizes = np.unique(y_encoded, return_counts=True)
for cls, count in zip(class_ids[:10], class_sizes[:10]):
    print(f"Class {cls} ({label_encoder.inverse_transform([cls])[0]}): {count} samples")
Number of features: 12 Number of samples: 72198 Number of classes: 10 Class distribution: Class 0 (Apparel & Accessories|Clothing Accessories|Sunglasses): 4577 samples Class 1 (Apparel & Accessories|Clothing|Outerwear|Coats & Jackets): 4788 samples Class 2 (Apparel & Accessories|Clothing|Pants): 14289 samples Class 3 (Apparel & Accessories|Clothing|Shirts & Tops): 11957 samples Class 4 (Apparel & Accessories|Clothing|Shorts): 2752 samples Class 5 (Apparel & Accessories|Clothing|Skirts): 1872 samples Class 6 (Apparel & Accessories|Handbags, Wallets & Cases): 6322 samples Class 7 (Apparel & Accessories|Jewelry|Earrings): 1507 samples Class 8 (Apparel & Accessories|Jewelry|Necklaces): 1428 samples Class 9 (Apparel & Accessories|Shoes): 22706 samples
In [8]:
# Standardize features (zero mean / unit variance) and hold out 20% for
# testing, stratified so class proportions match the full dataset.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# random_state pins the split so results are reproducible across re-runs;
# without it, every kernel restart produces different train/test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Feature dimensions: {X_train.shape[1]}")
Training set size: 57758 Test set size: 14440 Feature dimensions: 12
Hyperparameter tuning and modeling¶
In [9]:
# Candidate models and their hyperparameter grids. random_state is pinned
# (matching the commented-out configs below, which already used 42) so
# tuning results are reproducible across kernel restarts.
models = {
    'random forest': {
        'model': RandomForestClassifier(n_jobs=-1, random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [10, 20],
            'min_samples_split': [2, 5]
        }
    },
    'logistic regression': {
        # NOTE(review): n_jobs has no effect with the 'liblinear' solver
        # (sklearn emits a UserWarning for those grid cells, visible in the
        # output); it only parallelizes solvers such as 'lbfgs'.
        'model': LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42),
        'params': {
            'C': [0.1, 1],
            'solver': ['lbfgs', 'liblinear']
        }
    },
    # 'Neural Network': {
    #     'model': MLPClassifier(random_state=42, max_iter=500),
    #     'params': {
    #         'hidden_layer_sizes': [(50,), (100,)], # Reduced from [(50,), (100,), (50, 50)]
    #         'alpha': [0.0001, 0.001], # Reduced from [0.0001, 0.001, 0.01]
    #         'learning_rate': ['constant'] # Reduced from ['constant', 'adaptive']
    #     }
    # },
    # 'SVM': {
    #     'model': svm.SVC(random_state=42),
    #     'params': {
    #         'C': [1], # Reduced from [0.1, 1, 10]
    #         'kernel': ['rbf'] # Reduced from ['linear', 'rbf', 'poly', 'sigmoid']
    #     }
    # },
}
In [10]:
best_models = {}
cv_results_all = {}
# shuffle=True without an explicit seed produces different folds on every
# run; pinning random_state makes the CV scores reproducible.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

print(f"Starting hyperparameter tuning with {kfold.n_splits}-fold CV")
print(f"Total models to train: {len(models)}\n")

for idx, (name, model_config) in enumerate(models.items(), 1):
    start_time = time.time()
    print(f"[{idx}/{len(models)}] Training {name}")
    print(f"Start time: {datetime.now().strftime('%H:%M:%S')}")

    # Size of the search space, reported before fitting so long runs are
    # predictable.
    param_grid = model_config['params']
    n_combinations = 1
    for param_values in param_grid.values():
        n_combinations *= len(param_values)
    total_fits = n_combinations * kfold.n_splits
    print(f"Parameter combinations: {n_combinations}")
    print(f"Total fits (combinations × CV folds): {total_fits}")
    print(f"Testing parameters: {param_grid}")

    grid_search = GridSearchCV(
        model_config['model'],
        model_config['params'],
        cv=kfold,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    elapsed_time = time.time() - start_time

    # Keep both the refit best estimator and the full CV table for later
    # evaluation and plotting cells.
    best_models[name] = grid_search.best_estimator_
    cv_results_all[name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'cv_results': grid_search.cv_results_
    }

    print(f"\n{name} completed in {elapsed_time:.1f} seconds ({elapsed_time/60:.1f} minutes)")
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score (accuracy): {grid_search.best_score_:.4f}")
    print(f"End time: {datetime.now().strftime('%H:%M:%S')}")
Starting hyperparameter tuning with 3-fold CV
Total models to train: 2
[1/2] Training random forest
Start time: 22:23:28
Parameter combinations: 8
Total fits (combinations × CV folds): 24
Testing parameters: {'n_estimators': [50, 100], 'max_depth': [10, 20], 'min_samples_split': [2, 5]}
Fitting 3 folds for each of 8 candidates, totalling 24 fits
random forest completed in 27.0 seconds (0.4 minutes)
Best parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Best CV score (accuracy): 0.6405
End time: 22:23:55
[2/2] Training logistic regression
Start time: 22:23:55
Parameter combinations: 4
Total fits (combinations × CV folds): 12
Testing parameters: {'C': [0.1, 1], 'solver': ['lbfgs', 'liblinear']}
Fitting 3 folds for each of 4 candidates, totalling 12 fits
/Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn( /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn( /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn( /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn( /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn( /Users/snigdhapodugu/miniforge3/envs/dsc80/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1271: UserWarning: 'n_jobs' > 1 does not have any effect when 'solver' is set to 'liblinear'. Got 'n_jobs' = 11. warnings.warn(
logistic regression completed in 6.1 seconds (0.1 minutes)
Best parameters: {'C': 1, 'solver': 'lbfgs'}
Best CV score (accuracy): 0.4961
End time: 22:24:01
Cross-validation results¶
In [11]:
# Flatten the GridSearchCV results into one table of (model, params, score).
cv_scores_data = []
for name, results in cv_results_all.items():
    cv_results = results['cv_results']
    mean_scores = cv_results['mean_test_score']
    std_scores = cv_results['std_test_score']
    # (Removed an unused `param_names` local that was computed here.)
    for i, params in enumerate(cv_results['params']):
        param_str = ', '.join([f"{k}={v}" for k, v in params.items()])
        cv_scores_data.append({
            'Model': name,
            # Truncate very long configurations so labels stay readable.
            'Parameters': param_str[:50] + '...' if len(param_str) > 50 else param_str,
            'Accuracy': mean_scores[i],
            'Std': std_scores[i]
        })
cv_df = pd.DataFrame(cv_scores_data)

# One line per model: mean CV accuracy (with std error bars) across configs.
fig = go.Figure()
for model_name in cv_df['Model'].unique():
    model_data = cv_df[cv_df['Model'] == model_name].sort_values('Accuracy', ascending=False)
    fig.add_trace(go.Scatter(
        x=model_data['Parameters'],
        y=model_data['Accuracy'],
        error_y=dict(type='data', array=model_data['Std']),
        mode='markers+lines',
        name=model_name,
        text=model_data['Parameters'],
        hovertemplate='<b>%{text}</b><br>Accuracy: %{y:.4f}<br>Std: %{customdata:.4f}<extra></extra>',
        customdata=model_data['Std']
    ))
fig.update_layout(
    title='cross-validation scores',
    xaxis_title='hyperparameter configuration',
    yaxis_title='accuracy',
    height=600,
    # Tick labels are suppressed; configurations are shown on hover instead.
    xaxis=dict(tickangle=45, tickmode='array', tickvals=[])
)
fig.show()
In [12]:
# Compare the best CV accuracy achieved by each tuned model.
best_scores = {name: res['best_score'] for name, res in cv_results_all.items()}

fig = go.Figure(data=[
    go.Bar(
        x=list(best_scores),
        y=list(best_scores.values()),
        text=[f'{v:.4f}' for v in best_scores.values()],
        textposition='auto',
        marker_color='steelblue'
    )
])
fig.update_layout(
    title='best cross-validation accuracy scores',
    xaxis_title='model',
    yaxis_title='accuracy',
    height=500
)
fig.show()

print("best cross-validation accuracy scores:")
for name, score in sorted(best_scores.items(), key=lambda item: item[1], reverse=True):
    print(f" {name}: {score:.4f}")
best cross-validation accuracy scores: random forest: 0.6405 logistic regression: 0.4961
Training and test accuracies¶
In [13]:
# Evaluate each tuned model on both splits.
# NOTE(review): MSE on label-encoded class ids is not a meaningful metric
# for nominal categories — kept only because later cells report it.
train_results = {}
test_results = {}
for name, model in best_models.items():
    # Held-out performance (predictions kept for the classification report).
    y_pred = model.predict(X_test)
    test_results[name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'mse': mean_squared_error(y_test, y_pred),
        'predictions': y_pred
    }
    # Training-set performance, for the overfitting gap below.
    y_train_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    train_results[name] = {
        'accuracy': train_accuracy,
        'mse': mean_squared_error(y_train, y_train_pred)
    }
    print(f"{name}:")
    print(f"training accuracy: {train_accuracy:.4f}")
    print(f"test accuracy: {test_results[name]['accuracy']:.4f}")
    print(f"difference: {train_accuracy - test_results[name]['accuracy']:.4f}\n")
random forest: training accuracy: 0.8308 test accuracy: 0.6563 difference: 0.1745 logistic regression: training accuracy: 0.4957 test accuracy: 0.4983 difference: -0.0027
In [14]:
# Grouped bar chart contrasting training vs test accuracy per model to
# surface overfitting at a glance.
model_names = list(test_results.keys())
train_accuracies = [train_results[name]['accuracy'] for name in model_names]
test_accuracies = [test_results[name]['accuracy'] for name in model_names]

fig = go.Figure()
fig.add_trace(go.Bar(
    x=model_names,
    y=train_accuracies,
    name='Training Accuracy',
    marker_color='lightblue',
    text=[f'{v:.4f}' for v in train_accuracies],
    textposition='auto'
))
fig.add_trace(go.Bar(
    x=model_names,
    y=test_accuracies,
    name='Test Accuracy',
    marker_color='lightcoral',
    text=[f'{v:.4f}' for v in test_accuracies],
    textposition='auto'
))
fig.update_layout(
    # Fixed typo in title: "verus" -> "versus".
    title='training versus test accuracy comparison',
    xaxis_title='model',
    yaxis_title='accuracy',
    barmode='group',
    height=500,
    yaxis=dict(range=[0, 1])
)
fig.show()

# A large positive train-test gap indicates overfitting.
for name in model_names:
    gap = train_results[name]['accuracy'] - test_results[name]['accuracy']
    print(f"{name}: {gap:+.4f} ({'Overfitting' if gap > 0.05 else 'Good generalization' if gap < 0.02 else 'Moderate gap'})")
random forest: +0.1745 (Overfitting) logistic regression: -0.0027 (Good generalization)
In [15]:
# Side-by-side bars of held-out accuracy and MSE per model.
model_labels = list(test_results.keys())
test_accuracies = [test_results[m]['accuracy'] for m in model_labels]
test_mses = [test_results[m]['mse'] for m in model_labels]

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('test accuracy', 'test mse'),
    specs=[[{"secondary_y": False}, {"secondary_y": False}]]
)
fig.add_trace(
    go.Bar(x=model_labels, y=test_accuracies,
           text=[f'{v:.4f}' for v in test_accuracies], textposition='auto',
           marker_color='lightgreen', name='accuracy'),
    row=1, col=1
)
fig.add_trace(
    go.Bar(x=model_labels, y=test_mses,
           text=[f'{v:.4f}' for v in test_mses], textposition='auto',
           marker_color='lightcoral', name='MSE'),
    row=1, col=2
)
fig.update_layout(height=500, title_text="model performance on test set", showlegend=False)
fig.update_xaxes(title_text="model", row=1, col=1)
fig.update_xaxes(title_text="model", row=1, col=2)
fig.update_yaxes(title_text="accuracy", row=1, col=1)
fig.update_yaxes(title_text="mse", row=1, col=2)
fig.show()
In [16]:
# Pick the winner by held-out accuracy and show its per-class report.
best_model_name = max(test_results, key=lambda m: test_results[m]['accuracy'])
best_model = best_models[best_model_name]

print(f"Best model: {best_model_name}")
print(f"Test accuracy: {test_results[best_model_name]['accuracy']:.4f}")
print(f"Test mse: {test_results[best_model_name]['mse']:.4f}")

y_pred_best = test_results[best_model_name]['predictions']
print(f"\nClassification report for {best_model_name}:")
# label_encoder.classes_ is already ordered by encoded id, so it can be
# passed directly instead of inverse-transforming each index.
print(classification_report(y_test, y_pred_best,
                            target_names=list(label_encoder.classes_)))
Best model: random forest
Test accuracy: 0.6563
Test mse: 7.7428
Classification report for random forest:
precision recall f1-score support
Apparel & Accessories|Clothing Accessories|Sunglasses 0.72 0.67 0.70 915
Apparel & Accessories|Clothing|Outerwear|Coats & Jackets 0.55 0.31 0.40 958
Apparel & Accessories|Clothing|Pants 0.70 0.79 0.74 2858
Apparel & Accessories|Clothing|Shirts & Tops 0.53 0.56 0.54 2392
Apparel & Accessories|Clothing|Shorts 0.64 0.42 0.51 550
Apparel & Accessories|Clothing|Skirts 0.59 0.20 0.30 374
Apparel & Accessories|Handbags, Wallets & Cases 0.61 0.46 0.52 1265
Apparel & Accessories|Jewelry|Earrings 0.73 0.44 0.55 301
Apparel & Accessories|Jewelry|Necklaces 0.71 0.54 0.61 286
Apparel & Accessories|Shoes 0.70 0.83 0.76 4541
accuracy 0.66 14440
macro avg 0.65 0.52 0.56 14440
weighted avg 0.65 0.66 0.64 14440
Feature importance¶
In [17]:
# Plot impurity-based feature importances for models that expose them.
tree_models = ['random forest']
for model_name in tree_models:
    model = best_models.get(model_name)
    if model is None or not hasattr(model, 'feature_importances_'):
        continue
    feature_importance_df = (
        pd.DataFrame({'feature': feature_columns,
                      'importance': model.feature_importances_})
        .sort_values('importance', ascending=False)
    )
    fig = go.Figure(data=[
        go.Bar(
            x=feature_importance_df['importance'],
            y=feature_importance_df['feature'],
            orientation='h',
            marker_color='steelblue'
        )
    ])
    fig.update_layout(
        title=f'{model_name} feature importance',
        xaxis_title='importance',
        yaxis_title='feature',
        height=400
    )
    fig.show()
    print(f"{model_name} top 5 most important features:")
    print(feature_importance_df.head())
random forest top 5 most important features:
feature importance
1 y1 0.124765
8 center_y 0.112512
3 y2 0.108158
5 height 0.092303
6 area 0.081938
Summary¶
In [18]:
# Consolidated leaderboard: CV score vs held-out performance per model,
# sorted best-first by test accuracy.
summary_df = pd.DataFrame({
    'Model': list(test_results.keys()),
    'CV_accuracy': [best_scores[m] for m in test_results],
    'Test_accuracy': [test_results[m]['accuracy'] for m in test_results],
    'Test_MSE': [test_results[m]['mse'] for m in test_results]
}).sort_values('Test_accuracy', ascending=False)

print(summary_df.to_string(index=False))
print(f"\nbest model: {best_model_name}")
print(f"test accuracy: {test_results[best_model_name]['accuracy']:.4f}")
print(f"test mse: {test_results[best_model_name]['mse']:.4f}")
Model CV_accuracy Test_accuracy Test_MSE
random forest 0.640535 0.656302 7.742798
logistic regression 0.496104 0.498338 12.345776
best model: random forest
test accuracy: 0.6563
test mse: 7.7428